# Read the per-fire records, the two summary tables, and the suppression
# budget table from CSV files in the working directory.
WildfireData <- read.csv("final_wildfire.csv")
summary_nature <- read.csv("summary_nature.csv")
summary_peoplecaused <- read.csv("summary_peoplecaused.csv")
fire_budget <- read.csv("fire_suppression.csv")

# Stand-alone copies of the three weather columns of WildfireData.
Avg_Temp <- WildfireData[["tair_day_livneh_vic"]]
Avg_SoilMoisture <- WildfireData[["soilmoist1_day_livneh_vic"]]
Avg_Rainfall <- WildfireData[["rainfall_day_livneh_vic"]]

##Five Point Summary

# Convert currency text such as "$1,234,567" to a number by dropping every
# "$" and "," before coercion. Base R only, so this does not depend on
# stringr being attached (the visible code never loads it).
#   x: character (or numeric) vector of budget values.
#   returns: numeric vector of the same length; unparseable entries become NA.
parse_budget <- function(x) {
  as.numeric(gsub("[$,]", "", x))
}

fire_budget$Budget <- parse_budget(fire_budget$Budget)
WildfireData$Budget <- parse_budget(WildfireData$Budget)

# Print a per-column summary table of WildfireData; the warnings raised during
# the call and the rendered "Statistics summary" table follow below.
# NOTE(review): xkablesummary() is neither defined nor imported in the visible
# code - presumably it comes from a package loaded in a hidden chunk; confirm.
xkablesummary(WildfireData)
## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length

## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length

## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length

## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length

## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length

## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length

## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length

## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length

## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length

## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length

## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length

## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length

## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length

## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length

## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length

## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length

## Warning in stri_replace_all_regex(string, pattern,
## fix_replacement(replacement), : longer object length is not a multiple of
## shorter object length
Table: Statistics summary.
X Year DISCOVERY_DOY Budget DISCOVERY_DATE STAT_CAUSE_CODE STAT_CAUSE_DESCR CONT_DATE CONT_DOY FIRE_SIZE FIRE_SIZE_CLASS STATE existDay tair_day_livneh_vic month soilmoist1_day_livneh_vic rainfall_day_livneh_vic
Min Min. : 1 Min. :1992 Min. : 1.0 Min. : 43800000 Min. :2448622 Min. : 1.000 Length:189550 Min. :2448622 Min. : 1.0 Min. : 0.00 Length:189550 Length:189550 Min. : 0.00 Min. :-3.95 Min. : 1.000 Min. :10.51 Min. : 0.000
Q1 1st Qu.: 47388 1st Qu.:1997 1st Qu.:164.0 1st Qu.: 85591000 1st Qu.:2450624 1st Qu.: 2.000 Class :character 1st Qu.:2451362 1st Qu.:169.0 1st Qu.: 0.10 Class :character Class :character 1st Qu.: 0.00 1st Qu.:16.20 1st Qu.: 6.000 1st Qu.:11.27 1st Qu.: 0.005
Median Median : 94776 Median :2003 Median :202.0 Median :166000000 Median :2452860 Median : 5.000 Mode :character Median :2453248 Median :206.0 Median : 0.25 Mode :character Mode :character Median : 0.00 Median :20.51 Median : 7.000 Median :12.09 Median : 0.072
Mean Mean : 94776 Mean :2003 Mean :201.6 Mean :205045692 Mean :2452884 Mean : 5.691 NA Mean :2453347 Mean :203.6 Mean : 67.24 NA NA Mean : 1.05 Mean :18.87 Mean : 7.181 Mean :12.97 Mean : 0.404
Q3 3rd Qu.:142163 3rd Qu.:2009 3rd Qu.:245.0 3rd Qu.:252000000 3rd Qu.:2455034 3rd Qu.: 9.000 NA 3rd Qu.:2455763 3rd Qu.:246.0 3rd Qu.: 1.00 NA NA 3rd Qu.: 0.00 3rd Qu.:22.71 3rd Qu.: 9.000 3rd Qu.:13.80 3rd Qu.: 0.395
Max Max. :189550 Max. :2015 Max. :366.0 Max. :608000000 Max. :2457388 Max. :13.000 NA Max. :2457388 Max. :366.0 Max. :315578.80 NA NA Max. :3653.00 Max. :28.71 Max. :12.000 Max. :28.52 Max. :31.959
NA NA NA NA NA NA NA NA NA’s :97642 NA’s :97642 NA NA NA NA’s :97642 NA’s :13859 NA’s :13859 NA’s :13859 NA’s :13859

HISTOGRAMS

library(ggplot2)
library(gridExtra)
# Histograms of the three weather variables. Only binwidth is passed: when
# both binwidth and bins are given ggplot2 uses binwidth, so bins = 100 had no
# effect. The columns are mapped by name so each plot is tied to WildfireData,
# and na.rm = TRUE drops rows without weather data without a warning.

#Average Temperature
TempHist <- ggplot(WildfireData, aes(x = tair_day_livneh_vic)) +
  geom_histogram(binwidth = 0.5, na.rm = TRUE, col = "black", fill = "light blue 2") +
  labs(x = "Avg. Temp (C)", y = "Frequency", title = "HISTOGRAM: Average Temperature")

#Average Soil Moisture
SoilHist <- ggplot(WildfireData, aes(x = soilmoist1_day_livneh_vic)) +
  geom_histogram(binwidth = 0.5, na.rm = TRUE, col = "black", fill = "orange red 2") +
  labs(x = "Avg. Soil Moisture", y = "Frequency", title = "HISTOGRAM: Average Soil Moisture")

#Average Rainfall
RainHist <- ggplot(WildfireData, aes(x = rainfall_day_livneh_vic)) +
  geom_histogram(binwidth = 0.5, na.rm = TRUE, col = "black", fill = "green 3") +
  labs(x = "Avg. Rainfall", y = "Frequency", title = "HISTOGRAM: Average Rainfall")

#Wildfire Count by Year
# One bar per year: geom_bar() counts rows per x value, which is what
# geom_histogram(stat = "count") did while ignoring binwidth and bins.
CountHist <- ggplot(WildfireData, aes(x = Year)) +
  geom_bar(col = "black", fill = "yellow") +
  labs(x = "Years", y = "Frequency of Wildfires", title = "Wildfires count by year")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
# Lay the four plots out in a 2 x 2 grid and keep the returned object so the
# same layout can be written to disk below.
Histograms <- grid.arrange(TempHist, SoilHist, RainHist, CountHist, ncol=2, nrow=2)
## Warning: Removed 13859 rows containing non-finite values (stat_bin).
## Warning: Removed 13859 rows containing non-finite values (stat_bin).

## Warning: Removed 13859 rows containing non-finite values (stat_bin).

# Save the arranged grid as Histograms.jpg in the working directory.
ggsave("Histograms.jpg", plot = Histograms)
## Saving 7 x 5 in image

##Bar Graphs

# Bar chart: number of fire records in each size class.
FireBar <- ggplot(WildfireData, aes(FIRE_SIZE_CLASS)) +
  geom_bar(fill = "orange", col = "black") +
  labs(
    title = "Frequency of Wildfires by Size Classes",
    x = "Fire Size Class",
    y = "Frequency"
  )

# Bar chart: number of fire records in each year.
YearsBar <- ggplot(WildfireData, aes(Year)) +
  geom_bar(fill = "yellow", col = "black") +
  labs(
    title = "Frequency of Wildfires by Year",
    x = "Years",
    y = "Frequency"
  )

# Bar chart: number of fire records per budget value (built here but not
# placed in the grid below).
BudgetBar <- ggplot(WildfireData, aes(Budget)) +
  geom_bar(fill = "Pink 2", col = "black") +
  labs(
    title = "Frequency of Wildfires by Budget",
    x = "Budget",
    y = "Frequency"
  )

# Stack the size-class and year charts vertically.
grid.arrange(FireBar, YearsBar, nrow = 2)

##Pie Charts

# Pie chart of fire size classes, written to sizeclasspie.jpeg.
# The palette length comes from the table itself so every slice gets its own
# colour, and the device is closed so the file is flushed and later plots do
# not land in it.
size_counts <- table(WildfireData$FIRE_SIZE_CLASS)
lbls <- names(size_counts)
jpeg("sizeclasspie.jpeg")
# pie() draws as a side effect and returns NULL; the name is kept only so any
# later reference to it still resolves.
sizeclasspie <- pie(size_counts, col = rainbow(length(size_counts)), main = "Pie Chart of Fire Size Class")
invisible(dev.off())

# Pie chart of fire causes, written to cause_descrpPie.jpeg.
cause_counts <- table(WildfireData$STAT_CAUSE_DESCR)
lbls <- names(cause_counts)
jpeg("cause_descrpPie.jpeg")
cause_descrpPie <- pie(cause_counts, col = rainbow(length(cause_counts)), main = "Pie Chart of Wildfire Cause")
invisible(dev.off())
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Number of fire records per year, drawn as a line.
yearly_count <- WildfireData %>% count(Year)
colnames(yearly_count) <- c("Year", "Count")
ggplot(yearly_count, aes(x = Year, y = Count, group = 1)) +
  geom_line() +
  ggtitle("Yearly Recorded Fires")

# Budget is parsed from currency text ("$1,234") only if it is not numeric
# yet, so re-running this section does not push an already numeric column
# through a text round trip. Base-R gsub() removes every "$" and ",".
if (!is.numeric(fire_budget$Budget)) {
  fire_budget$Budget <- as.numeric(gsub("[$,]", "", fire_budget$Budget))
}
ggplot(fire_budget, aes(x = Year, y = Budget, group = 1)) +
  geom_line() +
  ggtitle("California Fire Suppression Budget 1979-2021")

# Mean fire size per year (aggregate's formula method drops rows with NA).
dat <- aggregate(FIRE_SIZE ~ Year, WildfireData, mean)
ggplot(dat, aes(x = Year, y = FIRE_SIZE, group = 1)) +
  geom_line() +
  ggtitle("Wildfire Sizes (1992-2013)") +
  ylab("Fire Size")

R Markdown

Hu Zhongyang's part: initialize the data

# Reload the per-fire records and the two summary tables.
final_fire <- read.csv("final_wildfire.csv")
summary_nature <- read.csv("summary_nature.csv")
summary_peoplecaused <- read.csv("summary_peoplecaused.csv")

# Give columns 4-7 of both summary tables short, matching names.
colnames(summary_nature)[4:7] <- c("temperature", "soilmoisture", "rainfall", "nfire")
colnames(summary_peoplecaused)[4:7] <- c("temperature", "soilmoisture", "rainfall", "nfire")

# Year and month become factors so they act as grouping variables in the
# boxplots and ANOVA calls further down.
summary_peoplecaused$Year <- as.factor(summary_peoplecaused$Year)
summary_peoplecaused$month <- as.factor(summary_peoplecaused$month)
summary_nature$Year <- as.factor(summary_nature$Year)
summary_nature$month <- as.factor(summary_nature$month)

Including Plots

Plot the yearly trend in the number of fires.

library(ggplot2)

# Total fire count per year for each summary table.
temp_plot <- aggregate(nfire ~ Year, data = summary_nature, FUN = sum)
temp_plot2 <- aggregate(nfire ~ Year, data = summary_peoplecaused, FUN = sum)

# Overlay both yearly series as points: blue from summary_nature, red from
# summary_peoplecaused.
ggplot() +
  geom_point(data = temp_plot, aes(x = Year, y = nfire), colour = 'blue') +
  geom_point(data = temp_plot2, aes(x = Year, y = nfire), colour = 'red') +
  labs(
    title = 'Number of Fires Each Year (Red for people-caused, Blue for other reasons)',
    y = 'Number of Fires'
  )

Plot boxplots by year and by month to show the trends.

library(ggpubr)
## Registered S3 methods overwritten by 'car':
##   method                          from
##   influence.merMod                lme4
##   cooks.distance.influence.merMod lme4
##   dfbeta.influence.merMod         lme4
##   dfbetas.influence.merMod        lme4
# Boxplots of fire counts and weather variables, grouped by Year and by month.
# Each expression is left at top level so the plot is printed when evaluated.

# People-caused fire counts.
ggplot(summary_peoplecaused, aes(x = Year, y = nfire)) +
  geom_boxplot() +
  labs(title = 'box-plot of number of people-caused fires for different years', y = 'Number of Fires')

ggplot(summary_peoplecaused, aes(x = month, y = nfire)) +
  geom_boxplot() +
  labs(title = 'box-plot of number of people-caused fires for different months', y = 'Number of Fires')

# Fire counts from summary_nature.
ggplot(summary_nature, aes(x = Year, y = nfire)) +
  geom_boxplot() +
  labs(title = 'box-plot of number of fires caused by other reasons for different years', y = 'Number of Fires')

ggplot(summary_nature, aes(x = month, y = nfire)) +
  geom_boxplot() +
  labs(title = 'box-plot of number of fires caused by other reasons for different months', y = 'Number of Fires')

# Temperature (taken from summary_peoplecaused).
ggplot(summary_peoplecaused, aes(x = Year, y = temperature)) +
  geom_boxplot() +
  labs(title = 'box-plot of temperature for different years', y = 'temperature')

ggplot(summary_peoplecaused, aes(x = month, y = temperature)) +
  geom_boxplot() +
  labs(title = 'box-plot of temperature for different months', y = 'temperature')

# Soil moisture (taken from summary_nature).
ggplot(summary_nature, aes(x = Year, y = soilmoisture)) +
  geom_boxplot() +
  labs(title = 'box-plot of soil moisture for different years', y = 'soil moisture')

ggplot(summary_nature, aes(x = month, y = soilmoisture)) +
  geom_boxplot() +
  labs(title = 'box-plot of soil moisture for different months', y = 'soil moisture')

# Rainfall (taken from summary_nature).
ggplot(summary_nature, aes(x = Year, y = rainfall)) +
  geom_boxplot() +
  labs(title = 'box-plot of rainfall for different years', y = 'average daily rainfall')

ggplot(summary_nature, aes(x = month, y = rainfall)) +
  geom_boxplot() +
  labs(title = 'box-plot of rainfall for different months', y = 'average daily rainfall')

Setting up the groups for the first hypothesis test. We compare wildfires of different size classes to see how the weather conditions differed between them.

# One data frame per fire size class (A-G).
# %in% is used instead of == because == returns NA for a missing class, and a
# logical NA in a row index yields an all-NA row; %in% never returns NA.
classA <- WildfireData[WildfireData$FIRE_SIZE_CLASS %in% 'A', ]
classB <- WildfireData[WildfireData$FIRE_SIZE_CLASS %in% 'B', ]
classC <- WildfireData[WildfireData$FIRE_SIZE_CLASS %in% 'C', ]
classD <- WildfireData[WildfireData$FIRE_SIZE_CLASS %in% 'D', ]
classE <- WildfireData[WildfireData$FIRE_SIZE_CLASS %in% 'E', ]
classF <- WildfireData[WildfireData$FIRE_SIZE_CLASS %in% 'F', ]
classG <- WildfireData[WildfireData$FIRE_SIZE_CLASS %in% 'G', ]

When comparing the conditions during the smallest wildfires to the largest wildfires, it appears that air temperature was lower, soil moisture was higher, and rainfall was higher during less intense wildfires.

# Class A vs class G: two-sample t-test on mean air temperature.
# NOTE(review): var.equal = TRUE requests the pooled-variance test; the visible
# code never checks that the two groups have similar variances - confirm, or
# drop the argument to get t.test()'s default Welch test.
t.test(classA$tair_day_livneh_vic, classG$tair_day_livneh_vic, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  classA$tair_day_livneh_vic and classG$tair_day_livneh_vic
## t = -8.7583, df = 89797, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3.195132 -2.026586
## sample estimates:
## mean of x mean of y 
##  18.70072  21.31158
# Class A vs class G: pooled-variance two-sample t-test on mean soil moisture.
t.test(classA$soilmoist1_day_livneh_vic, classG$soilmoist1_day_livneh_vic, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  classA$soilmoist1_day_livneh_vic and classG$soilmoist1_day_livneh_vic
## t = 8.175, df = 89797, p-value = 2.998e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.8259187 1.3468515
## sample estimates:
## mean of x mean of y 
##  13.08929  12.00291
# Class A vs class G: pooled-variance two-sample t-test on mean rainfall.
t.test(classA$rainfall_day_livneh_vic, classG$rainfall_day_livneh_vic, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  classA$rainfall_day_livneh_vic and classG$rainfall_day_livneh_vic
## t = 4.4724, df = 89797, p-value = 7.744e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1484135 0.3799739
## sample estimates:
## mean of x mean of y 
## 0.4636929 0.1994992
# Subsets by cause code (1, 2 and 7) for the cause-based t-tests.
# %in% keeps the filter NA-safe: a missing code can never produce an all-NA
# row, which a == comparison would.
code1 <- WildfireData[WildfireData$STAT_CAUSE_CODE %in% 1, ]
code2 <- WildfireData[WildfireData$STAT_CAUSE_CODE %in% 2, ]
code7 <- WildfireData[WildfireData$STAT_CAUSE_CODE %in% 7, ]

Now we will compare groups of wildfires categorized by their causes.

Code 1: Lightning; Code 2: Equipment Use; Code 7: Arson

When looking at the wildfires caused by Lightning versus those caused by Equipment Use, average temperature, soil moisture, and rainfall in CA were significantly different. In particular, during Lightning-caused wildfires, air temperature was higher, soil moisture was lower, and rainfall was higher.

# Cause code 1 vs code 2: two-sample t-test on mean air temperature.
# NOTE(review): var.equal = TRUE requests the pooled-variance test; the visible
# code never checks that the two groups have similar variances - confirm, or
# drop the argument to get t.test()'s default Welch test.
t.test(code1$tair_day_livneh_vic, code2$tair_day_livneh_vic, var.equal=TRUE)
## 
##  Two Sample t-test
## 
## data:  code1$tair_day_livneh_vic and code2$tair_day_livneh_vic
## t = 85.583, df = 62967, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  2.936384 3.074032
## sample estimates:
## mean of x mean of y 
##  22.10035  19.09514
# Cause code 1 vs code 2: pooled-variance two-sample t-test on mean soil moisture.
t.test(code1$soilmoist1_day_livneh_vic, code2$soilmoist1_day_livneh_vic, var.equal=TRUE)
## 
##  Two Sample t-test
## 
## data:  code1$soilmoist1_day_livneh_vic and code2$soilmoist1_day_livneh_vic
## t = -38.009, df = 62967, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.6264540 -0.5650133
## sample estimates:
## mean of x mean of y 
##  12.19398  12.78972
# Cause code 1 vs code 2: pooled-variance two-sample t-test on mean rainfall.
t.test(code1$rainfall_day_livneh_vic, code2$rainfall_day_livneh_vic, var.equal=TRUE)
## 
##  Two Sample t-test
## 
## data:  code1$rainfall_day_livneh_vic and code2$rainfall_day_livneh_vic
## t = 30.919, df = 62967, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.2152689 0.2444081
## sample estimates:
## mean of x mean of y 
## 0.5694914 0.3396529

When comparing the wildfires caused by Lightning versus those caused by Arson, it appears that the air temperature, soil moisture, and average rainfall in CA were significantly different. In particular, during lightning-caused wildfires, air temperature was higher, soil moisture was lower, and average rainfall was higher.

# Cause code 1 vs code 7: two-sample t-test on mean air temperature.
# NOTE(review): var.equal = TRUE requests the pooled-variance test; the visible
# code never checks that the two groups have similar variances - confirm, or
# drop the argument to get t.test()'s default Welch test.
t.test(code1$tair_day_livneh_vic, code7$tair_day_livneh_vic, var.equal=TRUE)
## 
##  Two Sample t-test
## 
## data:  code1$tair_day_livneh_vic and code7$tair_day_livneh_vic
## t = 82.448, df = 43091, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  3.208857 3.365139
## sample estimates:
## mean of x mean of y 
##  22.10035  18.81335
# Cause code 1 vs code 7: pooled-variance two-sample t-test on mean soil moisture.
t.test(code1$soilmoist1_day_livneh_vic, code7$soilmoist1_day_livneh_vic, var.equal=TRUE)
## 
##  Two Sample t-test
## 
## data:  code1$soilmoist1_day_livneh_vic and code7$soilmoist1_day_livneh_vic
## t = -39.235, df = 43091, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.7279409 -0.6586716
## sample estimates:
## mean of x mean of y 
##  12.19398  12.88729
# Cause code 1 vs code 7: pooled-variance two-sample t-test on mean rainfall.
t.test(code1$rainfall_day_livneh_vic, code7$rainfall_day_livneh_vic, var.equal=TRUE)
## 
##  Two Sample t-test
## 
## data:  code1$rainfall_day_livneh_vic and code7$rainfall_day_livneh_vic
## t = 26.917, df = 43091, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.2019315 0.2336487
## sample estimates:
## mean of x mean of y 
## 0.5694914 0.3517013

When looking at conditions during arson-caused wildfires versus equipment use-caused wildfires, it appears that air temperature and soil moisture were significantly different, while rainfall was not (p = 0.16). In particular, during arson-caused fires, air temperature was lower and soil moisture was higher.

# Cause code 7 vs code 2: two-sample t-test on mean air temperature.
# NOTE(review): var.equal = TRUE requests the pooled-variance test; the visible
# code never checks that the two groups have similar variances - confirm, or
# drop the argument to get t.test()'s default Welch test.
t.test(code7$tair_day_livneh_vic, code2$tair_day_livneh_vic, var.equal=TRUE)
## 
##  Two Sample t-test
## 
## data:  code7$tair_day_livneh_vic and code2$tair_day_livneh_vic
## t = -6.3054, df = 56768, p-value = 2.895e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.3693827 -0.1941972
## sample estimates:
## mean of x mean of y 
##  18.81335  19.09514
# Cause code 7 vs code 2: pooled-variance two-sample t-test on mean soil moisture.
t.test(code7$soilmoist1_day_livneh_vic, code2$soilmoist1_day_livneh_vic, var.equal=TRUE)
## 
##  Two Sample t-test
## 
## data:  code7$soilmoist1_day_livneh_vic and code2$soilmoist1_day_livneh_vic
## t = 4.7886, df = 56768, p-value = 1.684e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.05763517 0.13750998
## sample estimates:
## mean of x mean of y 
##  12.88729  12.78972
# Cause code 7 vs code 2: pooled-variance two-sample t-test on mean rainfall.
t.test(code7$rainfall_day_livneh_vic, code2$rainfall_day_livneh_vic, var.equal=TRUE)
## 
##  Two Sample t-test
## 
## data:  code7$rainfall_day_livneh_vic and code2$rainfall_day_livneh_vic
## t = 1.3998, df = 56768, p-value = 0.1616
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.004821767  0.028918604
## sample estimates:
## mean of x mean of y 
## 0.3517013 0.3396529

Run one-way ANOVA tests of each statistic against year and against month.

# One-way ANOVA tables: each response is tested against Year and then against
# month (both were converted to factors earlier), one model per call.

# People-caused fire counts by year, then by month.
summary(aov(nfire~Year,summary_peoplecaused))
##              Df  Sum Sq Mean Sq F value Pr(>F)
## Year         21  360393   17162   0.849  0.657
## Residuals   242 4892242   20216
summary(aov(nfire~month,summary_peoplecaused))
##              Df  Sum Sq Mean Sq F value Pr(>F)    
## month        11 4167871  378897   88.02 <2e-16 ***
## Residuals   252 1084764    4305                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Fire counts from summary_nature by year, then by month.
summary(aov(nfire~Year,summary_nature))
##              Df   Sum Sq Mean Sq F value Pr(>F)
## Year         21  1352086   64385   0.608  0.911
## Residuals   242 25626302  105894
summary(aov(nfire~month,summary_nature))
##              Df   Sum Sq Mean Sq F value Pr(>F)    
## month        11 21164133 1924012   83.39 <2e-16 ***
## Residuals   252  5814256   23072                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Temperature (from summary_peoplecaused) by year, then by month.
summary(aov(temperature~Year,summary_peoplecaused))
##              Df Sum Sq Mean Sq F value Pr(>F)
## Year         21     31    1.47   0.033      1
## Residuals   242  10855   44.85
summary(aov(temperature~month,summary_peoplecaused))
##              Df Sum Sq Mean Sq F value Pr(>F)    
## month        11  10458   950.7   560.1 <2e-16 ***
## Residuals   252    428     1.7                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Soil moisture (from summary_nature) by year, then by month.
summary(aov(soilmoisture~Year,summary_nature))
##              Df Sum Sq Mean Sq F value Pr(>F)
## Year         21  129.9   6.185   0.491  0.972
## Residuals   242 3048.5  12.597
summary(aov(soilmoisture~month,summary_nature))
##              Df Sum Sq Mean Sq F value Pr(>F)    
## month        11 2616.9  237.90   106.8 <2e-16 ***
## Residuals   252  561.5    2.23                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Rainfall (from summary_nature) by year, then by month.
summary(aov(rainfall~Year,summary_nature))
##              Df Sum Sq Mean Sq F value Pr(>F)  
## Year         21  34.08   1.623   1.458 0.0931 .
## Residuals   242 269.34   1.113                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(rainfall~month,summary_nature))
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## month        11  95.17   8.652   10.47 8.85e-16 ***
## Residuals   252 208.25   0.826                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Check the correlations among the numeric variables.

# Budget is stored as currency text ("$1,234"); drop every "$" and "," and
# coerce to numeric so it can enter cor(). Base-R gsub() avoids depending on
# stringr being attached.
summary_nature$Budget <- as.numeric(gsub("[$,]", "", summary_nature$Budget))
summary_peoplecaused$Budget <- as.numeric(gsub("[$,]", "", summary_peoplecaused$Budget))

# Correlation matrix of columns 4-9 of summary_nature.
cor_nature <- cor(summary_nature[c(4:9)])

library(corrplot)
## corrplot 0.84 loaded
corrplot(cor_nature, method = 'number')

# Same columns for summary_peoplecaused.
cor_people <- cor(summary_peoplecaused[c(4:9)])
corrplot(cor_people, method = 'number')

# total = fire counts of both tables added row by row, so the rows must line
# up: same number of rows and the same Year/month in each position.
stopifnot(
  nrow(summary_nature) == nrow(summary_peoplecaused),
  all(as.character(summary_nature$Year) == as.character(summary_peoplecaused$Year)),
  all(as.character(summary_nature$month) == as.character(summary_peoplecaused$month))
)
# The full column name nfire is spelled out; `$n` only worked through `$`
# partial matching and would break silently if another column starting with
# "n" were added.
summary_nature$total <- summary_nature$nfire + summary_peoplecaused$nfire

# Correlations with the combined count: column 7 (nfire) is left out and
# column 10 (total) takes its place.
cor_total <- cor(summary_nature[c(4, 5, 6, 8, 9, 10)])
corrplot(cor_total, method = 'number', type = 'lower', diag = TRUE)

Create linear models for the total number of fires (using the weather variables in summary_nature) and check their summaries; use VIF to decide which variables to keep.

Use residual plots and Q-Q plots to check the normality of the residuals.

# Simple linear regression of the combined fire count (total) on temperature.
model1=lm(total~temperature,data=summary_nature)
summary(model1)
## 
## Call:
## lm(formula = total ~ temperature, data = summary_nature)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -487.32 -156.40  -17.98  115.94  786.54 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -364.702     31.780  -11.48   <2e-16 ***
## temperature   60.159      2.077   28.97   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 216.7 on 262 degrees of freedom
## Multiple R-squared:  0.7621, Adjusted R-squared:  0.7612 
## F-statistic: 839.3 on 1 and 262 DF,  p-value: < 2.2e-16
# Add soil moisture as a second predictor of total.
model2=lm(total~temperature+soilmoisture,data=summary_nature)
# Standard lm diagnostic plots (residuals vs fitted, normal Q-Q, ...).
plot(model2)

summary(model2)
## 
## Call:
## lm(formula = total ~ temperature + soilmoisture, data = summary_nature)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -451.32 -143.28  -25.17  113.08  777.87 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   210.385    160.014   1.315 0.189733    
## temperature    48.050      3.878  12.389  < 2e-16 ***
## soilmoisture  -26.296      7.178  -3.664 0.000301 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 211.7 on 261 degrees of freedom
## Multiple R-squared:  0.7737, Adjusted R-squared:  0.772 
## F-statistic: 446.2 on 2 and 261 DF,  p-value: < 2.2e-16
# Three-predictor model: temperature, soil moisture and rainfall.
model3=lm(total~temperature+soilmoisture+rainfall,data=summary_nature)
summary(model3)
## 
## Call:
## lm(formula = total ~ temperature + soilmoisture + rainfall, data = summary_nature)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -436.00 -146.77  -19.03  108.12  770.88 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   295.033    171.473   1.721 0.086517 .  
## temperature    47.261      3.915  12.070  < 2e-16 ***
## soilmoisture  -32.356      8.441  -3.833 0.000159 ***
## rainfall       22.824     16.798   1.359 0.175424    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 211.4 on 260 degrees of freedom
## Multiple R-squared:  0.7753, Adjusted R-squared:  0.7727 
## F-statistic: 299.1 on 3 and 260 DF,  p-value: < 2.2e-16
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Variance inflation factors (car::vif) for the three-predictor model, then
# for the two-predictor model, to gauge collinearity among the predictors.
vif(model3)
##  temperature soilmoisture     rainfall 
##     3.735761     5.069232     1.916655
vif(model2)
##  temperature soilmoisture 
##     3.653691     3.653691
# Soil moisture as the only predictor of total, for comparison with model1.
model4=lm(total~soilmoisture,data=summary_nature)

summary(model4)
## 
## Call:
## lm(formula = total ~ soilmoisture, data = summary_nature)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -592.40 -176.14  -37.79  118.51  957.21 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2050.416     74.892   27.38   <2e-16 ***
## soilmoisture -102.079      4.723  -21.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 266.3 on 262 degrees of freedom
## Multiple R-squared:  0.6407, Adjusted R-squared:  0.6393 
## F-statistic: 467.1 on 1 and 262 DF,  p-value: < 2.2e-16
# Same predictors as model2, but with the response log-transformed.
# NOTE(review): log(total) is -Inf for any row where total is 0 - confirm
# that no such rows exist in the data.
model5=lm(log(total)~temperature+soilmoisture,data=summary_nature)
# Residual plot from the car package.
residualPlot(model5)

# Standard lm diagnostic plots (residuals vs fitted, normal Q-Q, ...).
plot(model5)

summary(model5)
## 
## Call:
## lm(formula = log(total) ~ temperature + soilmoisture, data = summary_nature)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.92313 -0.27901 -0.02392  0.23857  1.31414 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.44582    0.30364  27.815  < 2e-16 ***
## temperature   0.05834    0.00736   7.926 6.59e-14 ***
## soilmoisture -0.23920    0.01362 -17.562  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4017 on 261 degrees of freedom
## Multiple R-squared:  0.8949, Adjusted R-squared:  0.8941 
## F-statistic:  1112 on 2 and 261 DF,  p-value: < 2.2e-16
# Variance inflation factors for the log-response model (same predictors as
# model2, so the values match those reported for model2).
vif(model5)
##  temperature soilmoisture 
##     3.653691     3.653691